this contains multiple timepoints http://rismyhammer.com/ml/Pre-Processing.html#imputation
slim <- read.csv("/Users/emily/projects/research/Stanislawski/BMI_risk_scores/new_meta_data/DRIFT_meta_deltas_03.12.2024.csv")
all <- read.csv("/Users/emily/projects/research/Stanislawski/BMI_risk_scores/new_meta_data/DRIFT_meta_longitudinal_03.12.2024.csv")
all_deltas <- read_excel("/Users/emily/projects/research/Stanislawski/BMI_risk_scores/new_meta_data/DRIFT2\ Analysis\ Data\ sets.xlsx", sheet = "longitudinal_delta")
long_survey <- read_excel("/Users/emily/projects/research/Stanislawski/BMI_risk_scores/new_meta_data/DRIFT2\ Analysis\ Data\ sets.xlsx", sheet = "longitudinal_survey")
meta <- read_csv("~/projects/research/Stanislawski/BMI_risk_scores/data/correct_meta_files/ashleys_meta/DRIFT_working_dataset_meta_deltas_filtered_05.21.2024.csv")
## Rows: 165 Columns: 256
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): subject_id, Intervention, consent
## dbl (213): record_id, randomized_group, cohort_number, age, sex, gender, rac...
## lgl (40): WBTOT_FAT_3m, WBTOT_LEANmass_3m, WBTOT_BMC_3m, WBTOT_Lean_BMC_3m,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Replace spaces with underscores in column names
colnames(meta) <- gsub(" ", "_", colnames(meta))
# Drop non-consenting individuals
meta <- meta[meta$consent != "no", ]
metS,
# Stratify by sex
meta_male <- meta %>% dplyr::filter(meta$sex == "1")
meta_female <- meta %>% dplyr::filter(meta$sex == "0")
# Glucose,
meta$met_gluc <- ifelse(meta$Glucose_BL >= 100, 1, 0)
meta$met_tg <- ifelse(meta$Triglyceride_lipid_BL >= 150, 1, 0)
meta$met_bp <- ifelse(meta$avg_systolic_BL >= 130, 1,
ifelse(meta$avg_diastolic_BL >= 85, 1, 0))
# Sex diff hdl
meta_female$met_hdl <- ifelse(meta_female$HDL_Total_Direct_lipid_BL <= 50, 1, 0)
meta_male$met_hdl <- ifelse(meta_male$HDL_Total_Direct_lipid_BL <= 40, 1, 0)
# Sex diff waist circ
meta_female$met_wc <- ifelse(meta_female$wc_average_BL >= 89, 1, 0)
meta_male$met_wc <- ifelse(meta_male$wc_average_BL >= 102, 1, 0)
# merge
mf <- meta_female %>% dplyr::select(record_id, met_hdl, met_wc)
mm <- meta_male %>% dplyr::select(record_id, met_hdl, met_wc)
mfm <- rbind(mf, mm)
mmm <- merge(mfm, meta, by = "record_id")
# Calculate the sum of the specified columns and create the new variable
# criteria is greater than 3
mmm$metS_BL <- ifelse(rowSums(mmm[c("met_hdl", "met_wc", "met_gluc",
"met_tg", "met_bp")]) >= 3,
1, 0)
meta <- mmm
remove(mmm, mm, mfm, mf, meta_female, meta_male)
Predictor variables will be only baseline for aim 1, and longitudinal for aim 2. That means two separate data sets.
removed some highly correlated variables, 18m columns and non-completer
a2 <- meta %>% dplyr::select(c(record_id, subject_id, randomized_group, consent,
cohort_number, sex, race, age, completer,
# BL
outcome_BMI_fnl_BL, Glucose_BL, HOMA_IR_BL,
Insulin_endo_BL, HDL_Total_Direct_lipid_BL,
LDL_Calculated_BL, Triglyceride_lipid_BL,
# 3m
#outcome_BMI_fnl_3m, Glucose_3m, HOMA_IR_3m,
#Insulin_endo_3m, HDL_Total_Direct_lipid_3m,
#LDL_Calculated_3m, Triglyceride_lipid_3m,
# 6m
outcome_BMI_fnl_6m, Glucose_6m, HOMA_IR_6m,
Insulin_endo_6m, HDL_Total_Direct_lipid_6m,
LDL_Calculated_6m, Triglyceride_lipid_6m,
# 12m
outcome_BMI_fnl_12m, Glucose_12m, HOMA_IR_12m,
Insulin_endo_12m, HDL_Total_Direct_lipid_12m,
LDL_Calculated_12m, Triglyceride_lipid_12m))
# Remove non - 12 month completers
a2_meta <- a2 %>%
filter(completer != 0) %>%
dplyr::select(-c(completer, consent)) %>%
mutate(across(where(is.logical), as.numeric)) # make logical numeric
Pre-processing transformation (centering, scaling etc.) can be estimated from the training data and applied to any data set with the same variables. The operations are applied in this order: zero-variance filter, near-zero variance filter, correlation filter, Box-Cox/Yeo-Johnson/exponential transformation, centering, scaling, range, imputation, PCA, ICA then spatial sign. k-nearest neighbor imputation is carried out by finding the k closest samples (Euclidian distance) in the training set. Imputation via bagging fits a bagged tree model for each predictor (as a function of all the others). This method is simple, accurate and accepts missing values, but it has much higher computational cost. Imputation via medians takes the median of each predictor in the training set, and uses them to fill missing values. This method is simple, fast, and accepts missing values, but treats each predictor independently, and may be inaccurate.
# Preprocess the data - imputation and scaling
preProcValues <- preProcess(a2_meta[, 7:28],
method = c("center",
"scale",
"YeoJohnson",
"knnImpute",
"nzv"),
thresh = 0.95, # cutoff for cumulative % of variance to be retained by PCA
pcaComp = NULL, #no. PCA components to keep. If specified, over-rides thresh
na.remove = TRUE, # should missing values be removed from the calculations
k = 5, # number of nearest neighbors from training set to use for imputation
knnSummary = mean, # function to average neighbor values/column during imputation
outcome = "outcome_BMI_fnl_BL",
fudge = 0.2, # a tolerance value: Box-Cox transformation lambda values
numUnique = 15, # no. unique values y has to estimate Box-Cox transformation
verbose = TRUE, # a logical: prints a log as the computations proceed
freqCut = 95/5, # cutoff for ratio of most to 2nd most common value.
uniqueCut = 10, # cutoff % of distinct values out of no. total samples
cutoff = 0.85, # a numeric value for the pair-wise absolute correlation cutoff.
rangeBounds = c(0, 1))
## applying them to training data
## Calculating 22 means for centering
## Calculating 22 standard deviations for scaling
a2_meta_Transformed <- cbind(a2_meta[, 1:6],
predict(preProcValues, a2_meta[,7:28]))
train_Transformed_a1 <- fread("/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/data/clinical/transformed/aim1/train_samples_standard_clinical.csv")
test_Transformed_a1 <- fread("/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/data/clinical/transformed/aim1/test_samples_standard_clinical.csv")
a1_T <- fread("/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/data/clinical/transformed/aim1/a1_meta_Transformed_standard_clinical.csv")
# Make training & testing to match the samples in training and testing of ANCOMBC
training_sample_names <- train_Transformed_a1$subject_id
testing_sample_names <- test_Transformed_a1$subject_id
# Filter rows in BL_clr that match training and testing samples
a2_training <- a2_meta_Transformed %>% filter(subject_id %in% training_sample_names)
a2_testing <- a2_meta_Transformed %>% filter(subject_id %in% testing_sample_names)
plot_normality <- function(data) {
# Select only numeric columns
numeric_data <- data[sapply(data, is.numeric)]
# Loop through each numeric column to create and print plots
for (col_name in names(numeric_data)) {
# Histogram
p1 <- ggplot(numeric_data, aes_string(col_name)) +
geom_histogram(bins = 30, fill = "blue", alpha = 0.5) +
labs(title = paste("Histogram of", col_name), x = col_name, y = "Frequency") +
theme_minimal()
print(p1) # Print the histogram
# Q-Q Plot
p2 <- ggplot(numeric_data, aes_string(sample = col_name)) +
stat_qq() +
stat_qq_line() +
labs(title = paste("Q-Q Plot of", col_name), x = "Theoretical Quantiles", y = "Sample Quantiles") +
theme_minimal()
print(p2) # Print the Q-Q plot
}
}
plot_normality(a2_meta_Transformed[,5:28])
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#write.csv(a2_training,
# "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/data/clinical/transformed/aim2/a2_train_samples_standard_clinical.csv")
#write.csv(a2_testing,
# "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/data/clinical/transformed/aim2/a2_test_samples_standard_clinical.csv")
#write.csv(a2_meta_Transformed,
# "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/data/clinical/transformed/aim2/a2_meta_Transformed_standard_clinical.csv")
#write.csv(a2_meta,
# "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/data/clinical/transformed/aim2/a2_meta_not_Transformed_standard_clinical.csv")